2. Extracting sex-specific PAR gene sequences
2.1 Alignment and variant calling
Samples listed in Table S1 were aligned to the reference genome constructed from the male sample of each species, using the Snakemake pipeline shown below.
# Pipeline preamble: pin the minimum Snakemake version this workflow was written for.
from snakemake.utils import min_version
import os
min_version("4.4.0")
# Set directory paths
dir_path = os.getcwd()  # NOTE(review): dir_path is unused in this chunk — confirm it is referenced further down the file
###############################################
################## PATHS ######################
###############################################
# Run configuration, supplied by Snakemake via --configfile / --config.
ID = config["samples"]            # list of sample IDs (used to expand per-sample targets)
SPECIES = config["species"]       # focal species name
FEMALE = config["female"]         # female sample ID (coverage / variant calling)
MALE = config["male"]             # male sample ID (coverage / variant calling)
FQ_DIR = config["fastq"]          # directory holding trimmed paired FASTQ files
REF_SPECIES = config["ref_species"]
REF_DIR = config["ref_dir"]
REF_NAME = config["ref_name"]
PREFIX = SPECIES + "_ref_" + REF_SPECIES
REF_PATH = REF_DIR + REF_NAME     # NOTE(review): unused in this chunk — possibly used further down
REF_FASTA = REF_DIR + REF_NAME + ".fasta"
# Output directory layout. Every *_DIR constant must end with "/" so that
# simple string concatenation yields valid paths.
MAP_DIR = "intermediate/bwa/" + PREFIX + "/"
# BUGFIX: the original lacked the "/" after "bedtools" and "freebayes",
# producing mangled sibling directories such as "intermediate/bedtools<PREFIX>/"
# instead of "intermediate/bedtools/<PREFIX>/".
GENCOV_DIR = "intermediate/bedtools/" + PREFIX + "/"
VCF_DIR = "intermediate/freebayes/" + PREFIX + "/"
RESULTDIR = "results/" + PREFIX + "/"
###############################################
################## RULES ######################
###############################################
rule all:
input:
REF_FASTA + ".bwt",
REF_FASTA + ".fai",
expand(MAP_DIR + "{S}" + ".sorted.status", S = ID),
expand(MAP_DIR + "{S}" + ".sorted.nodup.status", S = ID),
expand(MAP_DIR + "{S}" + ".sorted.nodup.bam.bai", S = ID),
VCF_DIR + SPECIES + ".vcf.status",
expand(MAP_DIR + "{S}" + ".sorted.flagstat", S = ID),
expand(MAP_DIR + "{S}" + ".sorted.nodup.flagstat", S = ID),
VCF_DIR + SPECIES + ".non-ref-ac_2_biallelic_qual.vcf",
VCF_DIR + SPECIES + ".non-ref-ac_2_biallelic_qual.vcf.gz"
##########################################################
##################### INDEX GENOME #######################
##########################################################
rule index_fasta_bwa:
input:
ref = REF_FASTA
output:
ref_bwt = REF_FASTA + ".bwt"
priority: 80
message: "Indexing {input} with BWA index."
threads: 2
shell:
"""
bwa index {input}
"""
rule index_fasta_samtools:
input:
ref = REF_FASTA
output:
ref_fai = REF_FASTA + ".fai"
priority: 70
threads: 2
shell:
"""
samtools faidx {input}
"""
##########################################################
######################## MAPPING #########################
##########################################################
rule map:
input:
R1 = FQ_DIR + "{S}_forward_paired.fq.gz",
R2 = FQ_DIR + "{S}_reverse_paired.fq.gz",
ref = REF_FASTA,
ref_bwt = REF_FASTA + ".bwt"
output:
temp(MAP_DIR + "{S}" + ".bam")
message: "Mapping reads to ref"
threads: 20
params:
rg = "\"@RG\\tID:{S}\\tSM:{S}\""
shell:
"""
bwa mem -t {threads} -M -R {params.rg} {input.ref} {input.R1} {input.R2} | samtools view -Sb - > {output}
"""
rule sort_bam:
input:
MAP_DIR + "{S}" + ".bam"
output:
out = temp(MAP_DIR + "{S}" + ".sorted.bam"),
log = MAP_DIR + "{S}" + ".sorted.status"
threads: 15
params:
tmpdir = MAP_DIR + "{S}" + "_temp_sort/"
shell:
"""
mkdir {params.tmpdir}
samtools sort -@ {threads} {input} -T {params.tmpdir} > {output.out}
rm -r {params.tmpdir}
echo "DONE" > {output.log}
"""
rule remove_duplicates:
input:
MAP_DIR + "{S}" + ".sorted.bam"
output:
out = MAP_DIR + "{S}" + ".sorted.nodup.bam",
log = MAP_DIR + "{S}" + ".sorted.nodup.status"
params:
tmpdir = MAP_DIR + "{S}" + "_temp_dupl/"
shell:
"""
mkdir {params.tmpdir}
picard MarkDuplicates MAX_FILE_HANDLES=500 REMOVE_DUPLICATES=true I={input} O={output.out} M={input}_duplicatedata.txt TMP_DIR={params.tmpdir}
rm -r {params.tmpdir}
echo "DONE" > {output.log}
"""
rule index_bam:
input:
MAP_DIR + "{S}" + ".sorted.nodup.bam"
output:
MAP_DIR + "{S}" + ".sorted.nodup.bam.bai"
threads: 1
shell:
"""
samtools index {input}
"""
##########################################################
#################### GENOME COVERAGE #####################
##########################################################
rule gencov_prepare_fasta:
input:
ref_fai = REF_FASTA + ".fai"
output:
GENCOV_DIR + "genome_5kb_windows.out"
threads: 1
shell:
"""
bedtools makewindows -g {input} -w 5000 -s 5000 > {output}
"""
rule gencov_bedtoolsall:
input:
bam_f = MAP_DIR + FEMALE + ".sorted.nodup.bam",
bai_f = MAP_DIR + FEMALE + ".sorted.nodup.bam.bai",
bam_m = MAP_DIR + MALE + ".sorted.nodup.bam",
bai_m = MAP_DIR + MALE + ".sorted.nodup.bam.bai",
bed = GENCOV_DIR + "genome_5kb_windows.out"
output:
GENCOV_DIR + "gencov.nodup.nm.all.out"
threads: 2
shell:
"""
bedtools multicov -bams {input.bam_f} {input.bam_m} -bed {input.bed} > {output}
"""
##########################################################
#################### VARIANT CALLING #####################
##########################################################
rule freebayes_prep:
input:
ref_fai = REF_FASTA + ".fai"
output:
VCF_DIR + SPECIES + ".100kbp.regions"
threads: 4
shell:
"""
fasta_generate_regions.py {input} 100000 > {output}
"""
rule freebayes_parallel:
input:
ref = REF_FASTA,
regions = VCF_DIR + SPECIES + ".100kbp.regions",
f = MAP_DIR + FEMALE + ".sorted.nodup.bam",
m = MAP_DIR + MALE + ".sorted.nodup.bam"
output:
vcf = VCF_DIR + SPECIES + ".vcf",
log = VCF_DIR + SPECIES + ".vcf.status"
threads: 18
params:
tmpdir = VCF_DIR + "temp/"
shell:
"""
mkdir {params.tmpdir}
export TMPDIR={params.tmpdir}
freebayes-parallel {input.regions} {threads} -f {input.ref} {input.f} {input.m} > {output.vcf}
rm -r {params.tmpdir}
echo "DONE" > {output.log}
"""
rule vcftools_singletons:
input:
VCF_DIR + SPECIES + ".vcf"
output:
VCF_DIR + SPECIES + ".singletons.bed"
threads: 1
shell:
"""
vcftools --vcf {input} --singletons --remove-filtered-geno-all --minQ 20 --minDP 3 --stdout | awk -v OFS="\\t" '{{print $1,$2,$2+1,$3,$4,$5}}' > {output}
"""2.2 Extracting gene sequences
# Set directories and file paths
krakendir="kraken/bTaeGut1.pat.W.v2/"  # per-species subdirectories with lifted-over annotations (mapped.gtf)
genes="ZF_PAR.genes.new.list"          # zebra finch PAR gene names, one per line
home="/cfs/klemming/projects/supr/snic2020-2-25/user_data/hanna/sylvioidea_sexchromosome/supplementary_code_testing"
# Step 1: Create BED file for PAR genes ordered per exon
# For every species, pull each PAR gene's exon records from mapped.gtf and
# reshape them into a tab-separated table: scaffold, start, end, gene_id,
# transcript_id, exon_number (fields 5/7/9 of the ";"-split attribute column),
# sorted by transcript then numerically by exon number.
# NOTE(review): `grep $gene` is unanchored and unquoted — a gene name that is a
# substring of another gene's name would also match; verify the list is safe.
ls $krakendir/ | while read sp; do
cat $genes | while read gene; do
grep $gene $krakendir/${sp}/mapped.gtf | \
awk '$3=="exon" {print}' | \
cut -f 1,4,5,9 | \
sed 's/;/\t/g' | \
awk '{print $1,$2,$3,$5,$7,$9}' | \
tr -d "\"" | \
sed 's/ /\t/g' | \
sort -k5,5 -k6,6g
done > $krakendir/${sp}/${sp}.PAR.genes.bed
done
# Step 2: Extract lines from the GTF file in the same order
# For every row of the step-1 BED table, re-find the matching exon line in
# mapped.gtf (same scaffold/start/end) and keep only the one belonging to the
# expected transcript, so the GTF comes out in the BED's sort order.
# NOTE(review): this rescans the full GTF once per exon — correct but O(n*m);
# acceptable for small gene lists.
ls $krakendir/ | while read sp; do
cat $krakendir/${sp}/${sp}.PAR.genes.bed | while read scaff start end gene trans exonnr; do
cat $krakendir/${sp}/mapped.gtf | \
awk '{ if ($1=="'"$scaff"'" && $4=="'"$start"'" && $5=="'"$end"'" && $3=="exon") print $0}' | \
grep $trans
done > $krakendir/${sp}/${sp}.PAR.genes.gtf
done
# Step 3: Convert GTF to BED12 format and sort
# --do-not-sort keeps the exon order established in steps 1-2.
ls $krakendir/ | while read sp; do
gtf2bed --do-not-sort < $krakendir/${sp}/${sp}.PAR.genes.gtf > $krakendir/${sp}/${sp}.PAR.genes.sorted.bed
done
# Step 4: Generate FASTA sequences for PAR genes
# Per species: split the BED into one "<gene>.list" file per gene (column 4),
# then concatenate each gene's exon sequences into a single FASTA record.
cat samples_ref_genome.list | while read sp ref; do
cd $krakendir/${sp}
# Split BED file into separate list files by gene.
# BUGFIX: awk's close() must receive the exact expression used in the
# redirection. The original close($4) referred to a file that was never
# opened, so every "<gene>.list" stream stayed open (risking fd exhaustion
# on long gene lists) and the close was a silent no-op.
awk '{print >> $4 ".list"; close($4 ".list")}' ${sp}.PAR.genes.sorted.bed
# Extract FASTA sequences using bedtools: one header per gene, exon
# sequences (strand-aware, -s) joined onto a single line.
ls | grep list | while read gene; do
echo ">$gene"
bedtools getfasta -fi $home/../data/internal_raw/genome/${ref}.fasta -bed $gene -s -name | \
grep -v ">" | tr -d "\n"
echo
done | sed 's/.list//' > ${sp}.PAR.fasta
cd $home
done
# Step 5: Consensus sequence for male and female using VCF
# Filter the freebayes VCF to biallelic, non-reference, quality/depth-passing
# sites inside the PAR gene regions, then bgzip+tabix it for bcftools consensus.
# NOTE(review): the VCF path hardcodes "freebayes_17nov2019_parallel", which
# differs from the VCF_DIR used by the Snakemake pipeline above — confirm.
# NOTE(review): --non-ref-ac 1 is used here although target filenames elsewhere
# in this document say "non-ref-ac_2" — verify which threshold is intended.
cat samples_sex_sameline_ref.tsv | while read female male sp ref; do
# Filter and process VCF files
vcftools --vcf intermediate/freebayes_17nov2019_parallel/${sp}_ref_${sp}/${sp}.vcf \
--non-ref-ac 1 --min-alleles 2 --max-alleles 2 --remove-filtered-all \
--recode --stdout --minQ 20 --minDP 5 --bed $krakendir/${sp}/${sp}.PAR.genes.bed > \
$krakendir/${sp}/${sp}.filt.vcf
# Compress and index VCF
bgzip -c $krakendir/${sp}/${sp}.filt.vcf > $krakendir/${sp}/${sp}.filt.vcf.gz
tabix -p vcf $krakendir/${sp}/${sp}.filt.vcf.gz
done
# Step 6: Generate consensus sequences for male and female
# Apply the filtered variants to the reference with bcftools consensus, once
# per sex (-s selects the sample's genotypes), producing sex-specific genomes.
cat samples_sex_sameline_ref.tsv | while read female male sp ref; do
# Create female and male consensus FASTA sequences
cat $home/../data/internal_raw/genome/${ref}.fasta | \
bcftools consensus $krakendir/${sp}/${sp}.filt.vcf.gz -s $female > \
$krakendir/${sp}/${sp}.female.fasta
cat $home/../data/internal_raw/genome/${ref}.fasta | \
bcftools consensus $krakendir/${sp}/${sp}.filt.vcf.gz -s $male > \
$krakendir/${sp}/${sp}.male.fasta
done
# Step 7: Separate exons into separate sequences for male and female
# Build an exon-level BED (name = gene_exonNumber) and extract one FASTA
# record per exon from each sex-specific consensus genome.
cat samples_ref_genome.list | while read sp ref; do
# Create exon-specific BED file for PAR genes (name column becomes $4"_"$15).
# NOTE(review): field $15 is assumed to hold the exon number in the BED12+
# attribute layout emitted by gtf2bed — confirm against the actual file.
cat $krakendir/${sp}/${sp}.PAR.genes.sorted.bed | \
awk '{print $1,$2,$3,$4"_"$15,$5,$6}' | \
sed 's/ /\t/g' | tr -d ";\"" > $krakendir/${sp}/${sp}.PAR.exonSeparate.genes.sorted.bed
cd $krakendir/${sp}
# Split the exons into list files.
# BUGFIX: close() must name the same file as the redirection; the original
# close($4) never closed "<name>.list", leaving every stream open (fd limit
# risk) while doing nothing.
awk '{print >> $4 ".list"; close($4 ".list")}' ${sp}.PAR.exonSeparate.genes.sorted.bed
# Extract FASTA for female exons; strip the "::coords" suffix bedtools -name appends.
bedtools getfasta -fi ${sp}.female.fasta -bed ${sp}.PAR.exonSeparate.genes.sorted.bed -s -name | \
sed 's/(+)//' | sed 's/(-)//' | sed 's/::/\t/' | cut -f 1 > ${sp}.PAR.exonSeparate.female.fasta
# Extract FASTA for male exons
bedtools getfasta -fi ${sp}.male.fasta -bed ${sp}.PAR.exonSeparate.genes.sorted.bed -s -name | \
sed 's/(+)//' | sed 's/(-)//' | sed 's/::/\t/' | cut -f 1 > ${sp}.PAR.exonSeparate.male.fasta
# NOTE(review): the original comment said "Clean up list files" but no rm
# command follows — the *.list files are left behind; confirm whether a
# cleanup (e.g. rm ./*.list) was intended.
cd $home
done
# Step 8: Reverse complement sequences in negative orientation
# Minus-strand exons are reverse-complemented with fastx_reverse_complement;
# plus-strand exons are appended unchanged. Both passes uppercase the sequence
# lines and collapse multi-line FASTA records first via oneline_fasta.
# NOTE(review): the input file ${sp}.exonSeparate.male.female.fasta is not
# produced anywhere in this chunk (step 7 writes ${sp}.PAR.exonSeparate.female.fasta
# and ...male.fasta) — presumably a concatenation step is missing or renamed; verify.
module load Fastx/0.0.14
# Collapse a FASTA file so each record is exactly two lines (header + sequence).
oneline_fasta() {
awk '/^>/ {printf("\n%s\n",$0);next;} { printf("%s",$0);} END {printf("\n");}' "$1"
}
# Pass 1: minus-strand exons, reverse-complemented.
ls $krakendir | while read sp ; do cat $krakendir/${sp}/${sp}.PAR.genes.sorted.bed | sed 's/ /\t/g' | awk '{print $1,$6,$4"_"$15}' | tr -d ";\"" | sed 's/ /\t/g' | awk '$2=="-" {print}' | while read scaff orientation exon ; do oneline_fasta $krakendir/${sp}/${sp}.exonSeparate.male.female.fasta | grep $exon$ -A 1 | grep -v "^--" | awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' | fastx_reverse_complement ; done ; done > allSp.PAR.exonSeparate.female.male.outgroup.fasta
# Pass 2: plus-strand exons, appended as-is (uppercased).
ls $krakendir | while read sp ; do cat $krakendir/${sp}/${sp}.PAR.genes.sorted.bed | sed 's/ /\t/g' | awk '{print $1,$6,$4"_"$15}' | tr -d ";\"" | sed 's/ /\t/g' | awk '$2=="+" {print}' | while read scaff orientation exon ; do oneline_fasta $krakendir/${sp}/${sp}.exonSeparate.male.female.fasta | grep $exon$ -A 1 | grep -v "^--" | awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' ; done ; done >> allSp.PAR.exonSeparate.female.male.outgroup.fasta
# Step 9: Make multi-species fasta files for each gene
# Derive the unique exon-name list (header text after "male_"), then write one
# multi-species FASTA per exon into exonSeparate_outgroup/.
cat allSp.PAR.exonSeparate.female.male.outgroup.fasta | grep ">" | sed 's/male_/\t/' | cut -f 2 | sort | uniq > allPARexons.outgroup.list
mkdir exonSeparate_outgroup
# The sed at the end normalises one sample's ID (SylAtr_1EV02922 -> SylAtr).
cat allSp.PAR.exonSeparate.female.male.outgroup.fasta | grep ">" | sed 's/male_/\t/' | cut -f 2 | sort | uniq | while read gene ; do oneline_fasta allSp.PAR.exonSeparate.female.male.outgroup.fasta | grep ${gene}$ -A 1 | grep -v "^--" | sed 's/SylAtr_1EV02922/SylAtr/' > exonSeparate_outgroup/${gene}.fasta ; done
2.3 Calculating sequencing depth
krakendir="kraken/bTaeGut1.pat.W.v2/"
# Compute per-gene sequencing depth with jvarkit bamstats04 over the female and
# male BAMs, restricted to the Z/PAR gene intervals.
# NOTE(review): this loop only echoes the java command lines rather than
# executing them — presumably they were run separately (e.g. via a job
# scheduler); confirm this is intentional.
cat scratch/PAR/local_species_path.tsv | while read path female male sp ; do echo "java -jar ~/bin/jvarkit/dist/bamstats04.jar $path/$female.sorted.nodup.bam $path/$male.sorted.nodup.bam --bed $krakendir/$sp/${sp}.Z.genes.bed > $krakendir/$sp/${sp}.Z.genes.bamstat04.out" ; done
# Concatenate all species' bamstats04 tables, tagging each row with the species.
ls $krakendir | while read sp ; do cat $krakendir/$sp/${sp}.Z.genes.bamstat04.out | awk '{print $0 "\t" "'"$sp"'"}' ; done > allSp.Z.genes.bamstat04.out
# And lastly summarize
# NOTE(review): allSp.Z.genes.bed is not created in this chunk — confirm it
# exists from an earlier step.
bedtools intersect -a allSp.Z.genes.bamstat04.out -b allSp.Z.genes.bed -wa -wb | cut -f 5,6,7,8,9,12,16,17 > allSp.Z.genes.bamstat04.geneInfo.out
2.4 Private allele (singleton) analysis
# Call singletons (private alleles) per species from the filtered PAR VCFs.
ls $krakendir | while read sp ; do vcftools --vcf $krakendir/${sp}/${sp}.filt.vcf --singletons --out $krakendir/${sp}/${sp}.filt.singletons ; done
# Convert the singleton table (POS -> 0-based BED) and intersect with the
# Z/PAR gene intervals, keeping both the gene and singleton annotations.
ls $krakendir | while read sp ; do cat $krakendir/${sp}/${sp}.filt.singletons.singletons | grep -v CHROM | awk '{print $1,$2-1,$2,$3,$4,$5}' | sed 's/ /\t/g' | bedtools intersect -a $krakendir/$sp/${sp}.Z.genes.bed -b stdin -wa -wb > $krakendir/$sp/${sp}.Z.genes.singletons.krakenSNPs.bed ; done
# Pool all species, tagging rows with the species name.
ls $krakendir | while read sp ; do cat $krakendir/$sp/${sp}.Z.genes.singletons.krakenSNPs.bed | awk '{print $0 "\t" "'"$sp"'"}' ; done > results/PAR/allSp.Z.genes.singletons.krakenSNPs.bed
# Attach each sample's sex, keep true singletons ($10=="S" per vcftools
# --singletons output), count occurrences per gene/sample/species, and drop
# the GalMod outgroup from the summary.
cat data/meta/samples_sex.tsv | while read sample sp sex ; do cat results/PAR/allSp.Z.genes.singletons.krakenSNPs.bed | grep $sample | awk '{print $0 "\t" "'"$sex"'"}' ; done | awk '$10=="S" {print}' | cut -f 4,12,13,14 | sort | uniq -c | grep -v GalMod | awk '{print $1,$2,$3,$4,$5}' | sed 's/ /\t/g' > results/PAR/allSp.Z.genes.singletons.S.sum.krakenSNPs.out
# Lastly, transfer to laptop for plotting
scp hannas@rackham.uppmax.uu.se:/proj/sllstore2017102/nobackup/hanna/sylvioidea_sexchromosome/results/PAR/allSp.Z.genes.singletons.S.sum.krakenSNPs.out .